This report was generated on 2022-10-14 14:19:43. R version: 4.2.0 on x86_64-apple-darwin17.0. For this report, CRAN packages as of 2022-06-01 were used.
…
The preprocessing and analysis of the data was conducted in the R project for statistical
computing. The RMarkdown script used to generate this document and
all the resulting data can be downloaded under
this link. Through executing main.Rmd, the herein
described process can be reproduced and this document can be generated.
In the course of this, data from the folder input will be
processed and results will be written to output. The html
on-line version of the analysis can be accessed through this link.
The code for the herein described process can also be freely downloaded from https://github.com/fernandomillanvillalobos/r-data-visualization.
…
abc.csv (Example)

| Attribute | Type | Description |
|---|---|---|
| a | Numeric | … |
| b | Numeric | … |
| c | Numeric | … |
xyz.csv…
## [1] "package package:rmarkdown detached"
# from https://mran.revolutionanalytics.com/web/packages/\
# checkpoint/vignettes/using-checkpoint-with-knitr.html
# if you don't need a package, remove it from here (commenting not sufficient)
# tidyverse: see https://blog.rstudio.org/2016/09/15/tidyverse-1-0-0/
# Write the package manifest consumed by checkpoint() below. checkpoint
# scans this file for library() calls to decide which CRAN packages to
# install for the pinned snapshot date, so every dependency must appear
# here as an actual library() call (a commented-out line is ignored).
# NOTE: the duplicate library(ggforce) entry has been removed; listing a
# package twice is harmless to checkpoint but misleading to readers.
cat("
library(rstudioapi)
library(tidyverse)
library(scales)
library(lintr)
library(rmarkdown)
library(cowplot)
library(extrafont)
library(sf)
library(ggrepel)
library(gapminder)
library(socviz)
library(RColorBrewer)
library(ggforce)
library(dichromat)
library(ggridges)
library(viridis)
library(palmerpenguins)
library(lubridate)
library(ggthemes)
library(nycflights13)
library(broom)
library(ggiraph)
library(hexbin)
library(patchwork)
library(distributional)
library(psych)
library(ggalluvial)
library(ggdist)
library(ds4psy)
library(unikn)
library(ISLR)
library(MASS)
library(introdataviz)
library(nlme)
library(ozmaps)
library(rmapshaper)
library(bomrang)
library(elevatr)
library(giscoR)
library(tidygraph)
library(ggraph)
library(seriation)
library(ggtext)
library(directlabels)
library(gghighlight)
library(gganimate)",
file = "manifest.R")
# if checkpoint is not yet installed, install it (for people using this
# system for the first time)
# Bootstrap the checkpoint package. require() returns FALSE (instead of
# erroring) when a package is missing, so it doubles here as an
# is-installed check plus attach in one call.
if (!require(checkpoint)) {
# devtools provides install_github(); install it first if it is absent
if (!require(devtools)) {
install.packages("devtools", repos = "http://cran.us.r-project.org")
require(devtools)
}
# install a pinned release tag so the bootstrap itself is reproducible
devtools::install_github("RevolutionAnalytics/checkpoint",
ref = "v0.3.2", # could be adapted later,
# as of now (beginning of July 2017
# this is the current release on CRAN)
repos = "http://cran.us.r-project.org")
require(checkpoint)
}
# nolint start
# checkpoint() uses ~/.checkpoint as its private package library;
# create it on first use so the call below does not fail.
if (!dir.exists("~/.checkpoint")) {
  dir.create("~/.checkpoint")
}
# nolint end
# install packages for the specified CRAN snapshot date
# (package_date, path_to_wd and r_version are defined earlier in the
# Rmd setup chunk -- not visible in this excerpt)
checkpoint(snapshot_date = package_date,
           project = path_to_wd,
           verbose = TRUE,          # was T; always spell out TRUE/FALSE,
           scanForPackages = TRUE,  # since T and F are reassignable
           use.knitr = FALSE,
           R.version = r_version)
rm(package_date)
# attach every package listed in the manifest, then remove the temp file
source("manifest.R")
unlink("manifest.R")
sessionInfo()## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] C/UTF-8/C/C/C/C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gganimate_1.0.7 gghighlight_0.3.2 directlabels_2021.1.13
## [4] ggtext_0.1.1 seriation_1.3.5 ggraph_2.0.5
## [7] tidygraph_1.2.1 giscoR_0.3.2 elevatr_0.4.2
## [10] bomrang_0.7.4 rmapshaper_0.4.6 ozmaps_0.4.5
## [13] nlme_3.1-157 introdataviz_0.0.0.9003 MASS_7.3-57
## [16] ISLR_1.4 unikn_0.4.0 ds4psy_0.8.0
## [19] ggdist_3.1.1 ggalluvial_0.12.3 psych_2.2.5
## [22] distributional_0.3.0 patchwork_1.1.1 hexbin_1.28.2
## [25] ggiraph_0.8.2 broom_0.8.0 nycflights13_1.0.2
## [28] ggthemes_4.2.4 lubridate_1.8.0 palmerpenguins_0.1.0
## [31] viridis_0.6.2 viridisLite_0.4.1 ggridges_0.5.3
## [34] dichromat_2.0-0.1 ggforce_0.3.3 RColorBrewer_1.1-3
## [37] socviz_1.2 gapminder_0.3.0 ggrepel_0.9.1
## [40] sf_1.0-7 extrafont_0.18 cowplot_1.1.1
## [43] rmarkdown_2.16 lintr_2.0.1 scales_1.2.1
## [46] forcats_0.5.2 stringr_1.4.1 dplyr_1.0.9
## [49] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
## [52] tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2
## [55] checkpoint_1.0.2 rstudioapi_0.14 knitr_1.40
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.2 htmlwidgets_1.5.4
## [4] grid_4.2.0 TSP_1.2-0 hoardr_0.5.2
## [7] munsell_0.5.0 codetools_0.2-18 units_0.8-0
## [10] withr_2.5.0 colorspace_2.0-3 progressr_0.11.0
## [13] uuid_1.1-0 Rttf2pt1_1.3.10 mnormt_2.1.0
## [16] polyclip_1.10-0 farver_2.1.1 rprojroot_2.0.3
## [19] vctrs_0.4.2 generics_0.1.3 xfun_0.33
## [22] R6_2.5.1 graphlayouts_0.8.0 rex_1.2.1
## [25] cachem_1.0.6 assertthat_0.2.1 googlesheets4_1.0.1
## [28] gtable_0.3.0 processx_3.7.0 rlang_1.0.6
## [31] cyclocomp_1.1.0 systemfonts_1.0.4 extrafontdb_1.0
## [34] lazyeval_0.2.2 gargle_1.2.0 yaml_2.3.5
## [37] modelr_0.1.9 backports_1.4.1 gridtext_0.1.4
## [40] tools_4.2.0 oz_1.0-21 ellipsis_0.3.2
## [43] jquerylib_0.1.4 proxy_0.4-26 jsonvalidate_1.3.2
## [46] Rcpp_1.0.9 plyr_1.8.7 progress_1.2.2
## [49] classInt_0.4-3 prettyunits_1.1.1 ps_1.7.1
## [52] haven_2.5.1 fs_1.5.2 crul_1.2.0
## [55] magrittr_2.0.3 magick_2.7.3 data.table_1.14.2
## [58] reprex_2.0.2 googledrive_2.0.0 hms_1.1.2
## [61] evaluate_0.16 readxl_1.4.1 gridExtra_2.3
## [64] compiler_4.2.0 KernSmooth_2.23-20 V8_4.2.0
## [67] crayon_1.5.2 htmltools_0.5.3 tzdb_0.3.0
## [70] DBI_1.1.3 tweenr_1.0.2 dbplyr_2.2.1
## [73] rappdirs_0.3.3 cli_3.4.1 quadprog_1.5-8
## [76] parallel_4.2.0 igraph_1.3.1 pkgconfig_2.0.3
## [79] registry_0.5-1 sp_1.5-0 terra_1.6-28
## [82] xml2_1.3.3 foreach_1.5.2 bslib_0.4.0
## [85] geojsonlint_0.4.0 rvest_1.0.3 callr_3.7.2
## [88] digest_0.6.29 httpcode_0.3.0 cellranger_1.1.0
## [91] curl_4.3.2 lifecycle_1.0.2 jsonlite_1.8.2
## [94] desc_1.4.1 fansi_1.0.3 pillar_1.8.1
## [97] lattice_0.20-45 fastmap_1.1.0 httr_1.4.4
## [100] glue_1.6.2 remotes_2.4.2 iterators_1.0.14
## [103] class_7.3-20 stringi_1.7.8 sass_0.4.2
## [106] e1071_1.7-9
# if you want to outsource logic to other script files, see README for
# further information
# Load all visualizations functions as separate scripts
# knitr::read_chunk() registers each script's labelled chunks with knitr
# so they can be referenced by label inside the Rmd; source() actually
# evaluates the script so its functions are available in this session.
knitr::read_chunk("scripts/dviz.supp.R")
source("scripts/dviz.supp.R")
knitr::read_chunk("scripts/themes.R")
source("scripts/themes.R")
knitr::read_chunk("scripts/plot_grid.R")
source("scripts/plot_grid.R")
knitr::read_chunk("scripts/align_legend.R")
source("scripts/align_legend.R")
knitr::read_chunk("scripts/label_log10.R")
source("scripts/label_log10.R")
knitr::read_chunk("scripts/outliers.R")
source("scripts/outliers.R")Concepts of the {ggplot2} Package: Part 1
ggplot2 is an R package for producing statistical, or data, graphics. Unlike most other graphics packages, ggplot2 has an underlying grammar, based on the Grammar of Graphics, that allows you to compose graphs by combining independent components. This makes ggplot2 powerful. Rather than being limited to sets of pre-defined graphics, you can create novel graphics that are tailored to your specific problem. While the idea of having to learn a grammar may sound overwhelming, ggplot2 is actually easy to learn: there is a simple set of core principles and there are very few special cases. The hard part is that it may take a little time to forget all the preconceptions that you bring over from using other graphics tools.
ggplot2 provides beautiful, hassle-free plots that take care of fiddly details like drawing legends. In fact, its carefully chosen defaults mean that you can produce publication-quality graphics in seconds. However, if you do have special formatting requirements, ggplot2’s comprehensive theming system makes it easy to do what you want. Ultimately, this means that rather than spending your time making your graph look pretty, you can instead focus on creating the graph that best reveals the message in your data.
ggplot2 is designed to work iteratively. You start with a layer that shows the raw data. Then you add layers of annotations and statistical summaries. This allows you to produce graphics using the same structured thinking that you would use to design an analysis. This reduces the distance between the plot in your head and the one on the page.
Learning the grammar will not only help you create graphics that you’re familiar with, but will also help you to create newer, better graphics. Without a grammar, there is no underlying theory, so most graphics packages are just a big collection of special cases. For example, in base R, if you design a new graphic, it’s composed of raw plot elements like lines and points so it’s hard to design new components that combine with existing plots. In ggplot2, the expressions used to create a new graphic are composed of higher-level elements, like representations of the raw data and statistical transformations, that can easily be combined with new datasets and other plots.
Wilkinson created the grammar of graphics to describe the fundamental features that underlie all statistical graphics. The grammar of graphics is an answer to the question of what is a statistical graphic? ggplot2 builds on Wilkinson’s grammar by focussing on the primacy of layers and adapting it for use in R. In brief, the grammar tells us that a graphic maps the data to the aesthetic attributes (colour, shape, size) of geometric objects (points, lines, bars). The plot may also include statistical transformations of the data and information about the plot’s coordinate system. Facetting can be used to plot for different subsets of the data. The combination of these independent components are what make up a graphic.
All plots are composed of the data, the information you want to visualise, and a mapping, the description of how the data’s variables are mapped to aesthetic attributes. There are five mapping components:
A layer is a collection of geometric elements and statistical transformations. Geometric elements, geoms for short, represent what you actually see in the plot: points, lines, polygons, etc. Statistical transformations, stats for short, summarise the data: for example, binning and counting observations to create a histogram, or fitting a linear model.
Scales map values in the data space to values in the aesthetic space. This includes the use of colour, shape or size. Scales also draw the legend and axes, which make it possible to read the original data values from the plot (an inverse mapping).
A coord, or coordinate system, describes how data coordinates are mapped to the plane of the graphic. It also provides axes and gridlines to help read the graph. We normally use the Cartesian coordinate system, but a number of others are available, including polar coordinates and map projections.
A facet specifies how to break up and display subsets of data as small multiples. This is also known as conditioning or latticing/trellising.
A theme controls the finer points of display, like the font size and background colour. While the defaults in ggplot2 have been chosen with care, you may need to consult other references to create an attractive plot.
It’s also important to note what the grammar doesn’t do:
It doesn’t suggest which graphics to use. While this book endeavours to promote a sensible process for producing plots, the focus is on how to produce the plots you want, not on which plot to produce.
It doesn’t describe interactive graphics, only static ones. There is essentially no difference between displaying ggplot2 graphs on a computer screen and printing them on a piece of paper.
There are a number of other graphics systems available in R: base graphics, grid graphics and trellis/lattice graphics. How does ggplot2 differ from them?
Base graphics were written by Ross Ihaka based on experience implementing the S graphics driver and partly looking at Chambers et al. Base graphics has a pen on paper model: you can only draw on top of the plot, you cannot modify or delete existing content. There is no (user accessible) representation of the graphics, apart from their appearance on the screen. Base graphics includes both tools for drawing primitives and entire plots. Base graphics functions are generally fast, but have limited scope. If you’ve created a single scatterplot, or histogram, or a set of boxplots in the past, you’ve probably used base graphics.
The development of “grid” graphics, a much richer system of graphical primitives, started in 2000. Grid is developed by Paul Murrell, growing out of his PhD work. Grid grobs (graphical objects) can be represented independently of the plot and modified later. A system of viewports (each containing its own coordinate system) makes it easier to lay out complex graphics. Grid provides drawing primitives, but no tools for producing statistical graphics.
The lattice package, developed by Deepayan Sarkar, uses grid graphics to implement the trellis graphics system of Cleveland and is a considerable improvement over base graphics. You can easily produce conditioned plots and some plotting details (e.g., legends) are taken care of automatically. However, lattice graphics lacks a formal model, which can make it hard to extend. Lattice graphics are explained in depth in Deepayan Sarkar.
ggplot2, started in 2005, is an attempt to take the good things about base and lattice graphics and improve on them with a strong underlying model which supports the production of any kind of statistical graphic, based on the principles outlined above. The solid underlying model of ggplot2 makes it easy to describe a wide range of graphics with a compact syntax, and independent components make extension easy. Like lattice, ggplot2 uses grid to draw the graphics, which means you can exercise much low-level control over the appearance of the plot.
htmlwidgets, http://www.htmlwidgets.org, provides a common framework for accessing web visualisation tools from R. Packages built on top of htmlwidgets include leaflet (https://rstudio.github.io/leaflet/, maps), dygraph (http://rstudio.github.io/dygraphs/, time series) and networkD3 (http://christophergandrud.github.io/networkD3/, networks).
plotly, https://plotly-r.com, is a popular javascript visualisation toolkit with an R interface. It’s a great tool if you want to make interactive graphics for HTML documents, and even comes with a ggplotly() function that can convert many ggplot2 graphics into their interactive equivalents.
The layered structure of ggplot2 encourages you to design and construct graphics in a structured manner. It is useful to think about the purpose of each layer before it is added. In general, there are three purposes for a layer:
To display the data. We plot the raw data for many reasons, relying on our skills at pattern detection to spot gross structure, local structure, and outliers. This layer appears on virtually every graphic. In the earliest stages of data exploration, it is often the only layer.
To display a statistical summary of the data. As we develop and explore models of the data, it is useful to display model predictions in the context of the data. Showing the data helps us improve the model, and showing the model helps reveal subtleties of the data that we might otherwise miss. Summaries are usually drawn on top of the data.
To add additional metadata: context, annotations, and references. A metadata layer displays background context, annotations that help to give meaning to the raw data, or fixed references that aid comparisons across panels. Metadata can be useful in the background and foreground.
A map is often used as a background layer with spatial data. Background metadata should be rendered so that it doesn’t interfere with your perception of the data, so is usually displayed underneath the data and formatted so that it is minimally perceptible. That is, if you concentrate on it, you can see it with ease, but it doesn’t jump out at you when you are casually browsing the plot.
Other metadata is used to highlight important features of the data. If you have added explanatory labels to a couple of inflection points or outliers, then you want to render them so that they pop out at the viewer. In that case, you want this to be the very last layer drawn.
These geoms are the fundamental building blocks of ggplot2. They are useful in their own right, but are also used to construct more complex geoms. Most of these geoms are associated with a named plot: when that geom is used by itself in a plot, that plot has a special name.
Each of these geoms is two dimensional and requires both x
and y aesthetics. All of them understand colour
(or color) and size aesthetics, and the filled
geoms (bar, tile and polygon) also understand fill.
geom_area() draws an area plot, which is a line plot
filled to the y-axis (filled lines). Multiple groups will be stacked on
top of each other.
geom_bar(stat = "identity") makes a bar
chart. We need stat = "identity" because the
default stat automatically counts values (so is essentially a
1d geom). The identity stat leaves the data unchanged.
Multiple bars in the same location will be stacked on top of one
another.
geom_line() makes a line plot. The group
aesthetic determines which observations are connected;
geom_line() connects points from left to right;
geom_path() is similar but connects points in the order
they appear in the data. Both geom_line() and
geom_path() also understand the aesthetic
linetype, which maps a categorical variable to solid,
dotted and dashed lines.
geom_point() produces a scatterplot.
geom_point() also understands the shape aesthetic.
geom_polygon() draws polygons, which are filled
paths. Each vertex of the polygon requires a separate row in the data.
It is often useful to merge a data frame of polygon coordinates with the
data just prior to plotting.
geom_rect(), geom_tile() and
geom_raster() draw rectangles.
geom_rect() is parameterised by the four corners of the
rectangle, xmin, ymin, xmax and ymax. geom_tile() is
exactly the same, but parameterised by the center of the rect and its
size, x, y, width and height. geom_raster() is a fast
special case of geom_tile() used when all the tiles are the
same size.
# Toy data set for the basic-geom demos: three (x, y) points, each
# tagged with a single-letter label.
df <- data.frame(
  x = c(3, 1, 5),
  y = c(2, 4, 6),
  label = letters[1:3]
)
# Base plot reused by all of the geom demos below: mapping comes from
# df, axis labels are suppressed, and plot titles render slightly
# smaller than the theme default.
p <- ggplot(df, aes(x, y, label = label)) +
  labs(x = NULL, y = NULL) +                    # hide the axis labels
  theme(plot.title = element_text(size = 12))   # shrink the plot title
p + geom_point() + ggtitle("point")p + geom_text() + ggtitle("text")# these geoms take up space outside the range of the data, and so push the axes out
p + geom_bar(stat = "identity") + ggtitle("bar") p + geom_tile() + ggtitle("raster")p + geom_area() + ggtitle("area")p + geom_line() + ggtitle("line")p + geom_path() + ggtitle("path")p + geom_polygon() + ggtitle("polygon")Geoms can be roughly divided into individual and collective geoms. An
individual geom draws a distinct graphical object for
each observation (row). For example, the point geom draws one point per
row. A collective geom displays multiple observations
with one geometric object. This may be a result of a statistical
summary, like a boxplot, or may be fundamental to the display of the
geom, like a polygon. Lines and paths fall somewhere in between: each
line is composed of a set of straight segments, but each segment
represents two points. How do we control the assignment of observations
to graphical elements? This is the job of the group
aesthetic.
By default, the group aesthetic is mapped to the
interaction of all discrete variables in the plot. This
often partitions the data correctly, but when it does not, or when no
discrete variable is used in a plot, you’ll need to explicitly define
the grouping structure by mapping group to a variable that has a
different value for each group.
There are three common cases where the default is not enough, and we
will consider each one below. In the following examples, we will use a
simple longitudinal data set, Oxboys, from the nlme
package. It records the heights (height) and centered ages
(age) of 26 boys (Subject), measured on nine
occasions (Occasion). Subject and
Occasion are stored as ordered factors.
head(Oxboys)## Grouped Data: height ~ age | Subject
## Subject age height Occasion
## 1 1 -1.0000 140.5 1
## 2 1 -0.7479 143.4 2
## 3 1 -0.4630 144.8 3
## 4 1 -0.1643 147.1 4
## 5 1 -0.0027 147.7 5
## 6 1 0.2466 150.2 6
In many situations, you want to separate your data into groups, but
render them in the same way. In other words, you want to be able to
distinguish individual subjects, but not identify them. This is common
in longitudinal studies with many subjects, where the plots are often
descriptively called spaghetti plots. For example, the following plot
shows the growth trajectory for each boy (each Subject). If
a group isn’t defined by a single variable, but instead by a combination
of multiple variables, use interaction() to combine them,
e.g. aes(group = interaction(school_id, student_id)).
# grouping a variable
ggplot(Oxboys, aes(age, height, group = Subject)) +
geom_point() +
geom_line()# not grouping
ggplot(Oxboys, aes(age, height)) +
geom_point() +
geom_line()Sometimes we want to plot summaries that use different levels of
aggregation: one layer might display individuals, while another displays
an overall summary. Building on the previous example, suppose we want to
add a single smooth line, showing the overall trend for all
boys. If we use the same grouping in both layers, we get one smooth per
boy. Instead of setting the grouping aesthetic in ggplot(),
where it will apply to all layers, we set it in geom_line()
so it applies only to the lines. There are no discrete variables in the
plot so the default grouping variable will be a constant and we get one
smooth. The group aesthetic is usually only needed when the grouping
information you need to tell ggplot about is not built into the
variables being mapped.
# wrong! we have inadvertently added a smoothed line for each boy. Grouping controls both the display of the geoms, and the operation of the stats: one statistical transformation is run for each group
ggplot(Oxboys, aes(age, height, group = Subject)) +
geom_line() +
geom_smooth(method = "lm", se = FALSE)# applying ONLY to geom_line
ggplot(Oxboys, aes(age, height)) +
geom_line(aes(group = Subject)) +
geom_smooth(method = "lm", size = 2, se = FALSE)# another example
p <- ggplot(
data = gapminder,
mapping = aes(
x = year,
y = gdpPercap
)
)
p + geom_line(aes(group = country))Some plots have a discrete x scale, but you still want to draw lines
connecting across groups. This is the strategy used in
interaction plots, profile plots, and parallel coordinate plots, among
others. There is one discrete variable in this plot, Occasion, so we get
one boxplot for each unique x value. Now we want to overlay lines that
connect each individual boy. Simply adding geom_line() does
not work: the lines are drawn within each occasion, not across each
subject.
# drawing boxplots of height at each measurement occasion
ggplot(Oxboys, aes(Occasion, height)) +
geom_boxplot()# not work!
ggplot(Oxboys, aes(Occasion, height)) +
geom_boxplot() +
geom_line(colour = "#3366FF", alpha = 0.5)# overriding the grouping to say we want one line per boy
ggplot(Oxboys, aes(Occasion, height)) +
geom_boxplot() +
geom_line(aes(group = Subject), colour = "#3366FF", alpha = 0.5)A final important issue with collective geoms is how the aesthetics of the individual observations are mapped to the aesthetics of the complete entity. What happens when different aesthetics are mapped to a single geometric element?
In ggplot2, this is handled differently for different collective
geoms. Lines and paths operate on a “first value”
principle: each segment is defined by two observations, and ggplot2
applies the aesthetic value (e.g., colour) associated with
the first observation when drawing the segment. That is, the aesthetic
for the first observation is used when drawing the first segment, the
second observation is used when drawing the second segment and so on.
The aesthetic value for the last observation is not used. An additional
limitation for paths and lines is worth noting: the line type must be
constant over each individual line. In R there is no way to draw a line
which has varying line type.
What about other collective geoms, such as polygons? Most collective
geoms are more complicated than lines and path, and a single geometric
object can map onto many observations. In such cases it is not obvious
how the aesthetics of individual observations should be combined. Due to
this ambiguity ggplot2 adopts a simple rule: the aesthetics from the
individual components are used only if they are all the same. If the
aesthetics differ for each component, ggplot2 uses a default value
instead. These issues are most relevant when mapping aesthetics to
continuous variables. For discrete variables, the default behaviour of
ggplot2 is to treat the variable as part of the group
aesthetic, as described above. This has the effect of splitting the
collective geom into smaller pieces. This works particularly well for
bar and area plots, because stacking the individual pieces produces the
same shape as the original ungrouped data.
If you try to map the fill aesthetic to a continuous
variable (e.g., hwy) in the same way, it doesn’t work. The default
grouping will only be based on class, so each bar is now associated with
multiple colours (depending on the value of hwy for the observations in
each class). Because a bar can only display one colour, ggplot2 reverts
to the default grey in this case. To show multiple colours, we need
multiple bars for each class, which we can get by overriding the
grouping.
# Example data: three points on the diagonal, each carrying a numeric
# colour value (used to contrast discrete vs continuous colour mapping).
df <- data.frame(
  x = 1:3,
  y = 1:3,
  colour = seq(1, 5, by = 2)
)
# where colour is discrete
ggplot(df, aes(x, y, colour = factor(colour))) +
geom_line(aes(group = 1), size = 2) +
geom_point(size = 5)# where colour is continous (even though the colour variable is continuous, ggplot2 does not smoothly blend from one aesthetic value to another. If this is the behaviour you want, you can perform the linear interpolation yourself)
ggplot(df, aes(x, y, colour = colour)) +
geom_line(aes(group = 1), size = 2) +
geom_point(size = 5)# default value
ggplot(mpg, aes(class)) +
geom_bar()# variable as a part of the group aesthetic
ggplot(mpg, aes(class, fill = drv)) +
geom_bar()# hwy = continous variable
ggplot(mpg, aes(class, fill = hwy)) +
geom_bar()# overriding the grouping
ggplot(mpg, aes(class, fill = hwy, group = hwy)) +
geom_bar()# creating a factor with levels ordered
ggplot(mpg, aes(class, fill = factor(ordered(hwy)), group = hwy)) +
geom_bar()If you have information about the uncertainty present in your data, whether it be from a model or from distributional assumptions, it’s a good idea to display it. There are four basic families of geoms that can be used for this job, depending on whether the x values are discrete or continuous, and whether or not you want to display the middle of the interval, or just the extent.
geom_errorbar(), geom_linerange()
geom_crossbar(), geom_pointrange()
geom_ribbon()
geom_smooth(stat = "identity")

These geoms assume that you are interested in the distribution of y
conditional on x and use the aesthetics ymin and
ymax to determine the range of the y values. Most
statistics and geoms assume you are interested in y values conditional
on x values (e.g., smooth, summary, boxplot, line): in most statistical
models, the x values are assumed to be measured without error. If you
are interested in x conditional on y (or you just want to rotate the
plot 90 degrees), you can use coord_flip() to exchange the
x and y axes.
Because there are so many different ways to calculate standard errors, the calculation is up to you. For very simple cases, ggplot2 provides some tools in the form of summary functions described below, otherwise you will have to do it yourself.
# Example measurements (y) with a per-observation standard error (se),
# used to demo the uncertainty geoms below.
y <- c(18, 11, 16)
df <- data.frame(
  x = seq_along(y),
  y = y,
  se = c(1.2, 0.5, 1.0)
)
# geoms to show uncertainty
base <- ggplot(df, aes(x, y, ymin = y - se, ymax = y + se))
base + geom_crossbar()base + geom_pointrange()base + geom_smooth(stat = "identity")base + geom_errorbar()base + geom_linerange()base + geom_ribbon()When you have aggregated data where each row in the data set represents multiple observations, you need some way to take into account the weighting variable. There are a few different things we might want to weight by:
The choice of a weighting variable profoundly affects what we are
looking at in the plot and the conclusions that we will draw. There are
two aesthetic attributes that can be used to adjust for weights.
Firstly, for simple geoms like lines and points, use the
size aesthetic. For more complicated geoms which involve
some statistical transformation, we specify weights with the
weight aesthetic. These weights will be passed on to the
statistical summary function. Weights are supported for every case where
it makes sense: smoothers, quantile regressions, boxplots, histograms,
and density plots. You can’t see this weighting variable directly, and
it doesn’t produce a legend, but it will change the results of the
statistical summary.
When we weight a histogram or density plot by total population, we change from looking at the distribution of the number of counties, to the distribution of the number of people.
# Unweighted
ggplot(midwest, aes(percwhite, percbelowpoverty)) +
geom_point()# Weight by population
ggplot(midwest, aes(percwhite, percbelowpoverty)) +
geom_point(aes(size = poptotal / 1e6)) +
scale_size_area("Population\n(millions)", breaks = c(0.5, 1, 2, 4))# Unweighted
ggplot(midwest, aes(percwhite, percbelowpoverty)) +
geom_point() +
geom_smooth(method = lm, size = 1)# Weighted by population
ggplot(midwest, aes(percwhite, percbelowpoverty)) +
geom_point(aes(size = poptotal / 1e6)) +
geom_smooth(aes(weight = poptotal), method = lm, size = 1) +
scale_size_area(guide = "none")# unweighted histogram
ggplot(midwest, aes(percbelowpoverty)) +
geom_histogram(binwidth = 1) +
ylab("Counties")# weighted histogram
ggplot(midwest, aes(percbelowpoverty)) +
geom_histogram(aes(weight = poptotal), binwidth = 1) +
ylab("Population (1000s)")There are a number of geoms that can be used to display
distributions, depending on the dimensionality of the distribution,
whether it is continuous or discrete, and whether you are interested in
the conditional or joint distribution. For 1d continuous distributions
the most important geom is the histogram, geom_histogram().
It is important to experiment with binning to find a revealing view. You
can change the binwidth, specify the number of bins, or
specify the exact location of the breaks. Never rely on the default
parameters to get a revealing view of the distribution. When publishing
figures, don’t forget to include information about important parameters
(like bin width) in the caption.
If you want to compare the distribution between groups, you have a few options:
- Show small multiples of the histogram: facet_wrap(~ var).
- Use a frequency polygon with colour: geom_freqpoly().
- Use a conditional density plot: geom_histogram(position = "fill").

Both the histogram and frequency polygon geom use the same underlying
statistical transformation: stat = "bin". This statistic
produces two output variables: count and
density. By default, count is mapped to y-position, because
it’s most interpretable. The density is the count divided by the total
count multiplied by the bin width, and is useful when you want to
compare the shape of the distributions, not the overall size. An
alternative to a bin-based visualisation is a density estimate.
geom_density() places a little normal distribution at each
data point and sums up all the curves. It has desirable theoretical
properties, but is more difficult to relate back to the data. Use a
density plot when you know that the underlying density is smooth,
continuous and unbounded. You can use the adjust parameter
to make the density more or less smooth. Note that the area of
each density estimate is standardised to one so that you lose
information about the relative size of each group.
The histogram, frequency polygon and density display a detailed view of the distribution. However, sometimes you want to compare many distributions, and it’s useful to have alternative options that sacrifice quality for quantity. Here are three options:
geom_boxplot(): the box-and-whisker plot shows five
summary statistics along with individual “outliers”. It displays far
less information than a histogram, but also takes up much less space.
You can use boxplot with both categorical and continuous x. For
continuous x, you’ll also need to set the group aesthetic
to define how the x variable is broken up into bins. A useful helper
function is cut_width().
geom_violin(): the violin plot is a compact version
of the density plot. The underlying computation is the same, but the
results are displayed in a similar fashion to the boxplot.
geom_dotplot(): draws one point for each
observation, carefully adjusted in space to avoid overlaps and show the
distribution. It is useful for smaller datasets.
# showing distribution (default binwidth)
# NOTE(review): geom_histogram() warns about its default of 30 bins;
# always pick binwidth explicitly for published figures.
ggplot(diamonds, aes(depth)) +
  geom_histogram()

# choosing binwidth
ggplot(diamonds, aes(depth)) +
  geom_histogram(binwidth = 0.1) +
  xlim(55, 70)

# comparing distribution between groups
# frequency polygon
ggplot(diamonds, aes(depth)) +
  geom_freqpoly(aes(colour = cut), binwidth = 0.1, na.rm = TRUE) +
  xlim(58, 68) +
  theme(legend.position = "none")

# conditional density plot (position = "fill" normalises each bin to 1)
ggplot(diamonds, aes(depth)) +
  geom_histogram(aes(fill = cut), binwidth = 0.1, position = "fill",
na.rm = TRUE) +
  xlim(58, 68) +
  theme(legend.position = "none")

# geom density
ggplot(diamonds, aes(depth)) +
  geom_density(na.rm = TRUE) +
  xlim(58, 68) +
  theme(legend.position = "none")
ggplot(diamonds, aes(depth, fill = cut, colour = cut)) +
  geom_density(alpha = 0.2, na.rm = TRUE) +
  xlim(58, 68) +
  theme(legend.position = "none")

# boxplot
ggplot(diamonds, aes(clarity, depth)) +
  geom_boxplot()
# continuous x needs a group aesthetic; cut_width() bins carat into 0.1 slices
ggplot(diamonds, aes(carat, depth)) +
  geom_boxplot(aes(group = cut_width(carat, 0.1))) +
  xlim(NA, 2.05)

# violin plot
ggplot(diamonds, aes(clarity, depth)) +
  geom_violin()
ggplot(diamonds, aes(carat, depth)) +
  geom_violin(aes(group = cut_width(carat, 0.1))) +
  xlim(NA, 2.05)

# dot plot (one point per observation; best for small datasets)
ggplot(mtcars, aes(mpg)) +
  geom_dotplot(dotsize = 1.5, stackdir = "up")
#### Dealing with overplotting
The scatterplot is a very important tool for assessing the relationship between two continuous variables. However, when the data is large, points will be often plotted on top of each other, obscuring the true relationship. In extreme cases, you will only be able to see the extent of the data, and any conclusions drawn from the graphic will be suspect. This problem is called overplotting.
There are a number of ways to deal with it depending on the size of the data and severity of the overplotting. The first set of techniques involves tweaking aesthetic properties. These tend to be most effective for smaller datasets:
Very small amounts of overplotting can sometimes be alleviated by making the points smaller, or using hollow glyphs.
For larger datasets with more overplotting, you can use alpha blending (transparency) to make the points transparent. If you specify alpha as a ratio, the denominator gives the number of points that must be overplotted to give a solid colour. Values smaller than ~ 1/500 are rounded down to zero, giving completely transparent points.
If there is some discreteness in the data, you can randomly
jitter the points to alleviate some overlaps with
geom_jitter(). This can be particularly useful in
conjunction with transparency. By default, the amount of jitter added is
40% of the resolution of the data, which leaves a small gap between
adjacent regions. You can override the default with width and height
arguments.
Another approach to dealing with overplotting is to add data
summaries to help guide the eye to the true shape of the pattern within
the data. For example, you could add a smooth line showing the centre of
the data with geom_smooth() or use one of the summaries
below.
# getting the data
# NOTE(review): rnorm() is unseeded, so the simulated points differ
# between runs — fine for illustrating overplotting, not reproducible.
df <- data.frame(x = rnorm(2000), y = rnorm(2000))

# making the points smaller
norm <- ggplot(df, aes(x, y)) + xlab(NULL) + ylab(NULL)
norm + geom_point()
norm + geom_point(shape = 1) # Hollow circles
norm + geom_point(shape = ".") # Pixel sized

# setting alpha (the denominator gives the number of points that must be
# overplotted to give a solid colour)
norm + geom_point(alpha = 1 / 3)
norm + geom_point(alpha = 1 / 5)
norm + geom_point(alpha = 1 / 10)

# setting geom_jitter()
norm + geom_point() +
  geom_jitter()
norm + geom_point() +
  geom_jitter(width = 0.7, height = 0.7)
geom_histogram() and geom_bin2d() use a
familiar geom, geom_bar() and geom_raster(),
combined with a new statistical transformation, stat_bin()
and stat_bin2d(). stat_bin() and
stat_bin2d() combine the data into bins and count the
number of observations in each bin. But what if we want a summary other
than count? So far, we’ve just used the default statistical
transformation associated with each geom. Now we’re going to explore how
to use stat_summary_bin() and stat_summary_2d()
to compute different summaries. You can control the size of the bins and
the summary functions. stat_summary_bin() can produce
y, ymin and ymax aesthetics, also
making it useful for displaying measures of spread.
# counting
ggplot(diamonds, aes(color)) +
  geom_bar()
ggplot(diamonds, aes(table, depth)) +
  geom_bin2d(binwidth = 1, na.rm = TRUE) +
  xlim(50, 70) +
  ylim(50, 70)

# computing the average price
# swapping the default stat for "summary_bin"/"summary_2d" summarises
# with `fun` (here mean) instead of counting observations
ggplot(diamonds, aes(color, price)) +
  geom_bar(stat = "summary_bin", fun = mean)
ggplot(diamonds, aes(table, depth, z = price)) +
  geom_raster(binwidth = 1, stat = "summary_2d", fun = mean,
na.rm = TRUE) +
  xlim(50, 70) +
  ylim(50, 70)
So far we’ve considered two classes of geoms:
Simple geoms, where there’s a one-to-one correspondence between rows in the data frame and physical elements of the geom
Statistical geoms, which introduce a layer of statistical summaries in between the raw data and the result
Now we’ll consider cases where a visualisation of a three dimensional surface is required. The ggplot2 package does not support true 3d surfaces, but it does support many common tools for summarising 3d surfaces in 2d: contours, coloured tiles and bubble plots. These all work similarly, differing only in the aesthetic used for the third dimension. The reference to the ..level.. variable in this code may seem confusing, because there is no variable called ..level.. in the faithfuld data. In this context the .. notation refers to a variable computed internally.
# contour plot
# ..level.. is computed internally by the stat; it is not a column of faithfuld
ggplot(faithfuld, aes(eruptions, waiting)) +
  geom_contour(aes(z = density, colour = ..level..))

# heat map
ggplot(faithfuld, aes(eruptions, waiting)) +
  geom_raster(aes(fill = density))

# bubble plot
# Bubble plots work better with fewer observations
# getting the data (keep every 10th row)
small <- faithfuld[seq(1, nrow(faithfuld), by = 10), ]
ggplot(small, aes(eruptions, waiting)) +
  geom_point(aes(size = density), alpha = 1/3) +
  scale_size_area()
Plotting geospatial data is a common visualisation task, and one that requires specialised tools. Typically the problem can be decomposed into two problems: using one data source to draw a map, and adding metadata from another information source to the map.
Perhaps the simplest approach to drawing maps is to use
geom_polygon() to draw boundaries for different regions. For this
example we take data from the maps package using
ggplot2::map_data(). In this data set we have four
variables: lat and long specify the latitude and longitude of a vertex
(i.e. a corner of the polygon), id specifies the name of a region, and
group provides a unique identifier for contiguous areas within a region
(e.g. if a region consisted of multiple islands). To get a better sense
of what the data contains, we can plot mi_counties using geom_point(),
as shown in the left panel below. In this plot, each row in the data
frame is plotted as a single point, producing a scatterplot that shows
the corners of every county. To turn this scatterplot into a map, we use
geom_polygon() instead, which draws each county as a
distinct polygon.
There are a few limitations to the approach outlined above, not least
of which is the fact that the simple “longitude-latitude” data format is
not typically used in real world mapping. Vector data for maps are
typically encoded using the “simple features” standard produced by the
Open Geospatial Consortium. The sf package developed by
Edzer Pebesma https://github.com/r-spatial/sf provides an excellent
toolset for working with such data, and the geom_sf() and
coord_sf() functions in ggplot2 are
designed to work together with the sf package. One
advantage to sf data is immediately apparent, we can
easily see the overall structure of the data.
To introduce these functions, we rely on the ozmaps
package by Michael Sumner https://github.com/mdsumner/ozmaps/ which provides maps
for Australian state boundaries, local government areas, electoral
boundaries, and so on. The data is essentially a tibble with 9 rows and
2 columns. There are 9 distinct geographical units, so there are 9 rows
in this tibble. The most important column is geometry,
which specifies the spatial geometry for each of the states and
territories. Each element in the geometry column is a
multipolygon object which, as the name suggests, contains
data specifying the vertices of one or more polygons that demark the
border of a region. Given data in this format, we can use
geom_sf() and coord_sf() to draw a serviceable
map without specifying any parameters or even explicitly declaring any
aesthetics. To understand why this works, note that
geom_sf() relies on a geometry aesthetic that is not used
elsewhere in ggplot2. This aesthetic can be specified
in one of three ways:
geom_sf() will attempt to map it to a column named
geometry.geom_sf() can
automatically detect a geometry column, even if it’s not
called geometry.aes(geometry = my_column). This is useful if you have
multiple geometry columns.The coord_sf() function governs the map projection.
In some instances you may want to overlay one map on top of another.
The ggplot2 package supports this by allowing you to
add multiple geom_sf() layers to a plot. As an example,
I’ll use the oz_states data to draw the Australian states in different
colours, and will overlay this plot with the boundaries of Australian
electoral regions. We use two map layers: the first uses
oz_states to fill the states in different colours, and the
second uses oz_votes to draw the electoral boundaries.
Adding labels to maps is an example of annotating plots and is
supported by geom_sf_label() and
geom_sf_text(). Though geom_sf() is special in
some ways, it nevertheless behaves in much the same fashion as any other
geom, allowing additional data to be plotted on a map with standard
geoms.
At the start we drew maps by plotting longitude and latitude on a Cartesian plane, as if geospatial data were no different to other kinds of data one might want to plot. To a first approximation this is okay, but it’s not good enough if you care about accuracy. There are two fundamental problems with the approach.
The first issue is the shape of the planet. The Earth is neither a flat plane, nor indeed is it a perfect sphere. As a consequence, to map a co-ordinate value (longitude and latitude) to a location we need to make assumptions about all kinds of things. How ellipsoidal is the Earth? Where is the centre of the planet? Where is the origin point for longitude and latitude? Where is the sea level? How do the tectonic plates move? All these things are relevant, and depending on what assumptions one makes the same co-ordinate can be mapped to locations that are many meters apart. The set of assumptions about the shape of the Earth is referred to as the geodetic datum and while it might not matter for some data visualisations, for others it is critical. There are several different choices one might consider: if your focus is North America the “North American Datum” (NAD83) is a good choice, whereas if your perspective is global the “World Geodetic System” (WGS84) is probably better.
The second issue is the shape of your map. The Earth is approximately ellipsoidal, but in most instances your spatial data need to be drawn on a two dimensional plane. It is not possible to map the surface of an ellipsoid to a plane without some distortion or cutting, and you will have to make choices about what distortions you are prepared to accept when drawing a map. This is the job of the map projection.
Map projections are often classified in terms of the geometric properties that they preserve:
And unfortunately, it’s not possible for any projection to be shape-preserving and area-preserving. Taken together, the geodetic datum (e.g., WGS84), the type of map projection (e.g., Mercator) and the parameters of the projection (e.g., location of the origin) specify a coordinate reference system, or CRS, a complete set of assumptions used to translate the latitude and longitude information into a two dimensional map. An sf object often includes a default CRS. Most of this output corresponds to a well-known text (WKT) string that unambiguously describes the CRS. This verbose WKT representation is used by sf internally, but there are several ways to provide user input that sf understands. One such method is to provide numeric input in the form of an EPSG code (see http://www.epsg.org/). The default CRS in the oz_votes data corresponds to EPSG code 4283.
In ggplot2, the CRS is controlled by
coord_sf(), which ensures that every layer in the plot uses
the same projection. By default, coord_sf() uses the CRS
associated with the geometry column of the data. Because sf data
typically supply a sensible choice of CRS, this process usually unfolds
invisibly, requiring no intervention from the user. However, should you
need to set the CRS yourself, you can specify the crs parameter by
passing valid user input to st_crs().
Maps created using geom_sf() and coord_sf()
rely heavily on tools provided by the sf package, and
indeed the sf package contains many more useful tools
for manipulating simple features data. One advantage to simple features
over other representations of spatial data is that geographical units
can have complicated structure. The metadata for an sf object can be
accessed using helper functions. For example,
st_geometry_type() extracts the geometry type (e.g.,
MULTIPOLYGON), st_dimension() extracts the number of
dimensions (2 for XY data, 3 for XYZ), st_bbox() extracts
the bounding box as a numeric vector, and st_crs() extracts
the CRS as a list with two components, one for the EPSG code and the
other for the proj4string. We can “cast” the MULTIPOLYGON into the two
distinct POLYGON geometries from which it is constructed using
st_cast().
A second way to supply geospatial information for mapping is to rely
on raster data. Unlike the simple features format, in which geographical
entities are specified in terms of a set of lines, points and polygons,
rasters take the form of images. In the simplest case raster data might
be nothing more than a bitmap file, but there are many different image
formats out there. In the geospatial context specifically, there are
image formats that include metadata (e.g., geodetic datum, coordinate
reference system) that can be used to map the image information to the
surface of the Earth. For example, one common format is GeoTIFF, which
is a regular TIFF file with additional metadata supplied. Happily, most
formats can be easily read into R with the assistance of GDAL (the
Geospatial Data Abstraction Library, https://gdal.org/). For example the sf
package contains a function sf::gdal_read() that provides
access to the GDAL raster drivers from R. However, you rarely need to
call this function directly, as there are other high level functions
that take care of this for you.
# getting the data
# map_data() returns one row per polygon vertex; rename so that
# longitude/latitude map naturally onto the x/y aesthetics below
mi_counties <- map_data("county", "michigan") %>%
  dplyr::select(lon = long, lat, group, id = subregion)
head(mi_counties)
## lon lat group id
## 1 -83.88675 44.85686 1 alcona
## 2 -83.36536 44.86832 1 alcona
## 3 -83.36536 44.86832 1 alcona
## 4 -83.33098 44.83968 1 alcona
## 5 -83.30806 44.80530 1 alcona
## 6 -83.30233 44.77665 1 alcona
# plotting with geom_point()
ggplot(mi_counties, aes(lon, lat)) +
  geom_point(size = .25, show.legend = FALSE) +
  coord_quickmap()

# plotting with geom_polygon()
# `group` keeps separate polygons (e.g. islands) from being joined together
ggplot(mi_counties, aes(lon, lat, group = group)) +
  geom_polygon(fill = "white", colour = "grey50") +
  coord_quickmap()
# looking at ozmap data structure
oz_states <- ozmaps::ozmap_states
# printing shows the sf metadata: geometry type, bounding box and CRS
oz_states
## Simple feature collection with 9 features and 1 field
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 105.5507 ymin: -43.63203 xmax: 167.9969 ymax: -9.229287
## Geodetic CRS: GDA94
## # A tibble: 9 × 2
## NAME geometry
## * <chr> <MULTIPOLYGON [°]>
## 1 New South Wales (((150.7016 -35.12286, 150.6611 -35.11782, 150.6…
## 2 Victoria (((146.6196 -38.70196, 146.6721 -38.70259, 146.6…
## 3 Queensland (((148.8473 -20.3457, 148.8722 -20.37575, 148.85…
## 4 South Australia (((137.3481 -34.48242, 137.3749 -34.46885, 137.3…
## 5 Western Australia (((126.3868 -14.01168, 126.3625 -13.98264, 126.3…
## 6 Tasmania (((147.8397 -40.29844, 147.8902 -40.30258, 147.8…
## 7 Northern Territory (((136.3669 -13.84237, 136.3339 -13.83922, 136.3…
## 8 Australian Capital Territory (((149.2317 -35.222, 149.2346 -35.24047, 149.271…
## 9 Other Territories (((167.9333 -29.05421, 167.9188 -29.0344, 167.93…
# drawing a map with geom_sf()
# geom_sf() picks up the `geometry` column implicitly; no aes() needed
ggplot(oz_states) +
  geom_sf() +
  coord_sf()
# layering maps
# wrangling data
oz_states <- ozmaps::ozmap_states %>%
  filter(NAME != "Other Territories")
oz_votes <- rmapshaper::ms_simplify(ozmaps::abs_ced) # to reduce the time taken to render the plot
# first layer fills the states; second overlays electoral boundaries
ggplot() +
  geom_sf(data = oz_states, mapping = aes(fill = NAME), show.legend = FALSE) +
  geom_sf(data = oz_votes, fill = NA) +
  coord_sf()

# labelling maps
# filter electorates in the Sydney metropolitan region
sydney_map <- ozmaps::abs_ced %>%
  filter(NAME %in% c(
    "Sydney", "Wentworth", "Warringah", "Kingsford Smith", "Grayndler", "Lowe",
    "North Sydney", "Barton", "Bradfield", "Banks", "Blaxland", "Reid",
    "Watson", "Fowler", "Werriwa", "Prospect", "Parramatta", "Bennelong",
    "Mackellar", "Greenway", "Mitchell", "Chifley", "McMahon"
  ))
# draw the electoral map of Sydney
ggplot(sydney_map) +
  geom_sf(aes(fill = NAME), show.legend = FALSE) +
  coord_sf(xlim = c(150.97, 151.3), ylim = c(-33.98, -33.79)) + # to zoom
  geom_sf_label(aes(label = NAME), label.padding = unit(1, "mm"))

# adding other geoms (standard geoms can be combined with geom_sf layers)
oz_capitals <- tibble::tribble(
  ~city, ~lat, ~lon,
  "Sydney", -33.8688, 151.2093,
  "Melbourne", -37.8136, 144.9631,
  "Brisbane", -27.4698, 153.0251,
  "Adelaide", -34.9285, 138.6007,
  "Perth", -31.9505, 115.8605,
  "Hobart", -42.8821, 147.3272,
  "Canberra", -35.2809, 149.1300,
  "Darwin", -12.4634, 130.8456,
)
ggplot() +
  geom_sf(data = oz_votes) +
  geom_sf(data = oz_states, colour = "black", fill = NA) +
  geom_point(data = oz_capitals, mapping = aes(x = lon, y = lat), colour = "red") +
  coord_sf()
# looking at the default CRS
# st_crs() prints the CRS as user input (EPSG code) plus the full WKT string
st_crs(oz_votes)
## Coordinate Reference System:
## User input: EPSG:4283
## wkt:
## GEOGCRS["GDA94",
## DATUM["Geocentric Datum of Australia 1994",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["geodetic latitude (Lat)",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["geodetic longitude (Lon)",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## USAGE[
## SCOPE["Horizontal component of 3D system."],
## AREA["Australia including Lord Howe Island, Macquarie Islands, Ashmore and Cartier Islands, Christmas Island, Cocos (Keeling) Islands, Norfolk Island. All onshore and offshore."],
## BBOX[-60.56,93.41,-8.47,173.35]],
## ID["EPSG",4283]]
# looking at EPSG code
st_crs(oz_votes) == st_crs(4283)
## [1] TRUE
# switching the default CRS (coord_sf(crs = ...) reprojects every layer)
ggplot(oz_votes) + geom_sf()
ggplot(oz_votes) + geom_sf() + coord_sf(crs = st_crs(3112))
# more features of sf()
# getting the data
edenmonaro <- ozmaps::abs_ced %>% filter(NAME == "Eden-Monaro")
# example of a MULTIPOLYGON geometry
p <- ggplot(edenmonaro) + geom_sf()
p + coord_sf(xlim = c(147.75, 150.25), ylim = c(-37.5, -34.5))
p + coord_sf(xlim = c(150, 150.25), ylim = c(-36.3, -36))
# pull() drops the data frame wrapper, leaving the bare geometry column
edenmonaro <- edenmonaro %>% pull(geometry)
edenmonaro
## Geometry set for 1 feature
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 147.6874 ymin: -37.50503 xmax: 150.2307 ymax: -34.53558
## Geodetic CRS: GDA94
# metadata helpers: bounding box, dimensionality, geometry type
sf::st_bbox(edenmonaro)
## xmin ymin xmax ymax
## 147.68741 -37.50503 150.23068 -34.53558
sf::st_dimension(edenmonaro)
## [1] 2
sf::st_geometry_type(edenmonaro)
## [1] MULTIPOLYGON
## 18 Levels: GEOMETRY POINT LINESTRING POLYGON MULTIPOINT ... TRIANGLE
# casting sf object (MULTIPOLYGON -> its two constituent POLYGONs)
sf::st_cast(edenmonaro, "POLYGON")
## Geometry set for 2 features
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 147.6874 ymin: -37.50503 xmax: 150.2307 ymax: -34.53558
## Geodetic CRS: GDA94
# example casting (breaking the whole map into the constituent polygons)
# whole map
dawson <- ozmaps::abs_ced %>%
  filter(NAME == "Dawson") %>%
  pull(geometry)
dawson
## Geometry set for 1 feature
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 146.761 ymin: -21.21307 xmax: 149.9114 ymax: -19.18582
## Geodetic CRS: GDA94
ggplot(dawson) +
  geom_sf() +
  coord_sf()

# casting to POLYGON, then dropping the largest polygon (the mainland,
# element 69) so that only the islands remain
dawson <- sf::st_cast(dawson, "POLYGON")
which.max(sf::st_area(dawson))
## [1] 69
ggplot(dawson[-69]) +
  geom_sf() +
  coord_sf()

# raster maps
# giscoR::gisco_get_countries(country = "Australia") %>%
# elevatr::get_elev_raster(z = 5, clip = "location") %>%
# rayshader::raster_to_matrix() %>%
# rayshader::sphere_shade() %>%
# rayshader::plot_map()Like maps and spatial data, networks and graphs occupy a special part of the visualization landscape, but whereas spatial data mostly differ from regular plotting in their use of projections, networks bring their own data structure as well as their own visualization paradigms to the table. Because of these complications networks are not directly supported in ggplot2. Several efforts over the years have tried to add this missing piece and in this chapter we will see how to use ggraph for network visualization. Other packages that offer some of the same functionality includes geomnet, ggnetwork, and GGally for regular network plots, and ggtree and ggdendro for tree visualization specifically.
Networks (or graphs as their mathematical concept is called) are data that consists of entities (nodes or vertices) and their relation (edges or links). Both nodes and edges can have additional data attached, and edges can furthermore be considered directed or undirected depending on the nature of the connection (a network encoding mutual friendship would have undirected edges, whereas an ancestor network will have directed edges because child-of is not a symmetrical relation).
The nature of network data means that it is not readily representable in a single data frame, which is one of the key complications to using it with ggplot2. However, it can be encoded as two interrelated data frames, one encoding the nodes, and one encoding the edges. This is the approach used in tidygraph, which is the data-manipulation package underlying ggraph. To make better use of ggraph it is thus beneficial to understand a little about tidygraph. tidygraph can be considered first and foremost a dplyr API for network data, allowing the same semantics for manipulating networks as is known from dplyr. Network data is often presented in a range of different formats depending on where you get it from. tidygraph understands most of the different classes used in R for network data and these can be converted using as_tbl_graph().
While simply manipulating networks is nice, the real benefit of
networks comes from the different operations that can be performed on
them using the underlying structure. tidygraph has rich
support for a range of different groups of algorithms such as centrality
calculation (which node is most central), ranking (order nodes so nodes
are located close to those they are connected to), grouping (finding
clusters inside the network), etc. The algorithm API is designed to be
used inside mutate() and will always return a vector with
length and order matching the nodes or edges. Further, it does not
require you to specify the graph or nodes you want to calculate for
since this is given implicitly in the mutate() call.
ggraph builds on top of tidygraph and ggplot2 to allow a complete and familiar grammar of graphics for network data. Still, it is a little different from most ggplot2 extension packages since it works with another data type that is fundamentally different from tabular data. More so, most network visualizations don’t concern themselves with mapping variables to x and y aesthetics since they are concerned with showing the network topology more than relations between two variables. In order to show network topology the concept of layouts are employed. Layouts are algorithms that use the network structure to calculate (often arbitrary) x and y values for each node that can then be used for visualization purposes. To put it in another way, when plotting tabular data the x and y aesthetics are almost always mapped to existing variables in the data (or statistical transformations of existing data) whereas when plotting network data x and y are mapped to values derived from the topology of the network and which are by themselves meaningless.
Whereas a normal ggplot2 plot is initialized with a
ggplot() call, a ggraph plot is
initialized with a ggraph() call. The first argument is the
data, which can be a tbl_graph or any object convertible to one. The
second argument is a layout function and any further arguments will be
passed on to that function. The default layout will choose an
appropriate layout based on the type of graph you provide, but while it
is often a decent starting point you should always take control and
explore the different layouts available — networks are notorious for
their ability to show non-existing or exaggerated relations in some
layouts. The layout argument can either take a string or a function. If
a string is provided, the name will be matched to one of the built-in
layouts (of which there are many). If a function is provided it is
assumed that the function takes a tbl_graph and returns a
data frame with at least an x and y column and with the same number of
rows as there are nodes in the input graph.
Some layouts may be used in both a linear and circular version. The
correct way to change this in ggplot2 would be to use
coord_polar() to change the coordinate system, but since we
only want to change the position of nodes in the layout, and not affect
the edges, this is a function of the layout.
Of the two types of data stored in a graph, nodes are by far the ones
that are most alike to what we are used to plotting. After all, they are
often shown as points in very much the same way as observations are
displayed in a scatter plot. All the node drawing geoms in
ggraph are prefixed with geom_node_ and
the one you are most likely to use the most is
geom_node_point(). While it may superficially look a lot
like geom_point() it has some additional features that it
shares with all node and edge geoms. First, you don’t have to specify
the x and y aesthetics. These are given by the layout and their mapping
is implicit. Second, you have access to a filter aesthetic that allows
you to turn off the drawing of specific nodes. Third, you may use any
tidygraph algorithms inside the aes() and
they will get evaluated on the graph being visualized. Being able to use
algorithms directly inside the visualization code is a powerful way to
iterate on your visualization as you don’t need to go back and change
the input graph. Apart from points, there are more specialized geoms,
many tied to a specific type of layout.
Edge geoms have a lot more bells and whistles than node geoms, mainly
because there are so many different ways one can connect two things.
Edge geoms have access to the variables of the terminal nodes through
specially prefixed variables. For the standard and 0 version these are
available through node1. and node2. prefixed
variables, and for the 2 version they are available through node.
prefixed variables (as used above). The three versions of edge geoms are
common to all edge geom types, not just geom_edge_link().
There are more ways to draw edges than simple straight lines. Some are
specific to trees or specific layouts, but many are general purpose.
These geoms should only be used for relatively simple graphs since they
increase the amount of clutter and overplotting in the plot. A common
issue, especially when using arrows to show directionality of edges, is
that the node will overlap the edge because it runs to the center of the
node, not the edge of the point showing the node. Hence we would like
the edges to stop before they reach the point so that the arrow is not
obscured. This is possible in ggraph using the
start_cap and end_cap aesthetics which allow you to specify a clipping
region around the terminal nodes.
While it is natural to think of edges as different kinds of lines connecting points, this is only true for certain network plot types. One should always be mindful that nodes and edges are abstract concepts and can be visualized in a multitude of ways.
Faceting is not a concept often applied to network visualization, but
it is just as powerful for networks as it is for tabular data. While the
standard faceting functions in ggplot2 do technically
work with ggraph, they do not on a conceptual level,
since nodes and edges are connected and splitting nodes on multiple
subplots will automatically move edges with them even though the edges
do not have the faceting variable in their data. Because of this,
ggraph provides its own specialized versions of
facet_wrap() and facet_grid().
facet_nodes() and facet_edges() will
target either nodes or edges and wrap the panels in the same manner as
facet_wrap(). For facet_nodes() the convention
is that if an edge goes between two nodes in the same panel it will be
shown in that panel, but if it is split between multiple panels it will
be removed. For facet_edges() nodes will be repeated in all
panels.
# simple example of network using tidygraph
# activate() switches which of the two tables (nodes/edges) later verbs act on;
# .N() gives access to node data while manipulating edges
graph <- play_erdos_renyi(n = 10, p = 0.2) %>%
  activate(nodes) %>% # working on nodes
  mutate(class = sample(letters[1:4], n(), replace = TRUE)) %>%
  activate(edges) %>% # working on edges
  arrange(.N()$class[from])
graph
## # A tbl_graph: 10 nodes and 21 edges
## #
## # A directed simple graph with 1 component
## #
## # Edge Data: 21 × 2 (active)
## from to
## <int> <int>
## 1 4 1
## 2 1 3
## 3 4 10
## 4 1 6
## 5 4 6
## 6 1 8
## # … with 15 more rows
## #
## # Node Data: 10 × 1
## class
## <chr>
## 1 a
## 2 d
## 3 c
## # … with 7 more rows
# getting the data
data(highschool, package = "ggraph")
head(highschool)
## from to year
## 1 1 14 1957
## 2 1 15 1957
## 3 1 21 1957
## 4 1 54 1957
## 5 1 55 1957
## 6 2 21 1957
# converting a data frame into an edge list
hs_graph <- as_tbl_graph(highschool, directed = FALSE)
hs_graph
## # A tbl_graph: 70 nodes and 506 edges
## #
## # An undirected multigraph with 1 component
## #
## # Node Data: 70 × 1 (active)
## name
## <chr>
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## # … with 64 more rows
## #
## # Edge Data: 506 × 3
## from to year
## <int> <int> <dbl>
## 1 1 13 1957
## 2 1 14 1957
## 3 1 20 1957
## # … with 503 more rows
# converting the result of hclust() (a dendrogram becomes a rooted tree graph)
luv_clust <- hclust(dist(luv_colours[, 1:3]))
luv_graph <- as_tbl_graph(luv_clust)
luv_graph
## # A tbl_graph: 1313 nodes and 1312 edges
## #
## # A rooted tree
## #
## # Node Data: 1,313 × 4 (active)
## height leaf label members
## <dbl> <lgl> <chr> <int>
## 1 0 TRUE "101" 1
## 2 0 TRUE "427" 1
## 3 778. FALSE "" 2
## 4 0 TRUE "571" 1
## 5 0 TRUE "426" 1
## 6 0 TRUE "424" 1
## # … with 1,307 more rows
## #
## # Edge Data: 1,312 × 2
## from to
## <int> <int>
## 1 3 1
## 2 3 2
## 3 8 6
## # … with 1,309 more rows
# calculating the centrality of the nodes
# tidygraph algorithms such as centrality_pagerank() are evaluated inside
# mutate() on the graph itself — no need to pass the graph explicitly
graph %>%
  activate(nodes) %>%
  mutate(centrality = centrality_pagerank()) %>%
  arrange(desc(centrality))
## # A tbl_graph: 10 nodes and 21 edges
## #
## # A directed simple graph with 1 component
## #
## # Node Data: 10 × 2 (active)
## class centrality
## <chr> <dbl>
## 1 b 0.184
## 2 c 0.158
## 3 a 0.151
## 4 b 0.102
## 5 c 0.0905
## 6 a 0.0897
## # … with 4 more rows
## #
## # Edge Data: 21 × 2
## from to
## <int> <int>
## 1 6 3
## 2 3 5
## 3 6 4
## # … with 18 more rows
# plotting networks
# default layout (ggraph picks one based on the graph type)
ggraph(hs_graph) +
  geom_edge_link() +
  geom_node_point()

# different layouts
ggraph(hs_graph, layout = "drl") +
  geom_edge_link() +
  geom_node_point()
hs_graph <- hs_graph %>%
  activate(edges) %>%
  mutate(edge_weights = runif(n()))
ggraph(hs_graph, layout = "stress", weights = edge_weights) +
  geom_edge_link(aes(alpha = edge_weights)) +
  geom_node_point() +
  scale_edge_alpha_identity()

# circular version
ggraph(luv_graph, layout = 'dendrogram', circular = TRUE) +
  geom_edge_link() +
  coord_fixed()

# plotting nodes
# the filter aesthetic hides nodes; tidygraph algorithms work inside aes()
ggraph(hs_graph, layout = "stress") +
  geom_edge_link() +
  geom_node_point(
    aes(
      filter = centrality_degree() > 2,
      colour = centrality_power()
    ),
    size = 4
  )

# plotting a treemap
ggraph(luv_graph, layout = "treemap") +
  geom_node_tile(aes(fill = depth))

# plotting edges
ggraph(graph, layout = "stress") +
  geom_edge_link(aes(alpha = after_stat(index)))
# the "2" variant interpolates aesthetics between terminal nodes (node.class)
ggraph(graph, layout = "stress") +
  geom_edge_link2(
    aes(colour = node.class),
    width = 3,
    lineend = "round"
  )

# showing parallel edges
ggraph(hs_graph, layout = "stress") +
  geom_edge_fan()
ggraph(hs_graph, layout = "stress") +
  geom_edge_parallel()

# plotting a dendrogram
ggraph(luv_graph, layout = "dendrogram", height = height) +
  geom_edge_elbow()

# clipping edges around the nodes so the arrowheads are not obscured
ggraph(graph, layout = "stress") +
  geom_edge_link(
    arrow = arrow(),
    start_cap = circle(5, "mm"),
    end_cap = circle(5, "mm")
  ) +
  geom_node_point(aes(colour = class), size = 8)

# looking at matrix plots
ggraph(hs_graph, layout = "matrix", sort.by = node_rank_traveller()) +
  geom_edge_point()

# faceting
ggraph(hs_graph, layout = "stress") +
  geom_edge_link() +
  geom_node_point() +
  facet_edges(~year)
When constructing a data visualisation, it is often necessary to make annotations to the data displayed. Conceptually, an annotation supplies metadata for the plot: that is, it provides additional information about the data being displayed. From a practical standpoint, however, metadata is just another form of data. Because of this, the annotation tools in ggplot2 reuse the same geoms that are used to create other plots. However, to meet the specific needs that users often have when annotating plots, there are some helper functions in ggplot2 itself, and a number of other packages have extended ggplot2 in ways you may find helpful.
When customising a plot, it is often useful to modify the titles
associated with the plot, axes, and legends. To assist with this task
ggplot2 provides the labs() helper
function, which lets you set the various titles using name-value pairs
like title = "My plot title",
x = "X axis" or fill = "fill legend". The values supplied
to labs() are typically text strings, with \n
used to specify line breaks, but you can also supply mathematical
expressions wrapped in quote(). The rules by which these
expressions are interpreted can be found by typing
?plotmath. It is also possible to include (some) markdown
in axis and legend titles with the help of the ggtext
package and the ggplot2 theme system. To enable
markdown you need to set the relevant theme element to
ggtext::element_markdown(). There are two ways to remove
the axis label. Setting labs(x = "") omits the label but
still allocates space; setting labs(x = NULL) removes the
label and its space.
Adding text to a plot is one of the most common forms of annotation.
Most plots will not benefit from adding text to every single observation
on the plot, but labelling outliers and other important points is very
useful. However, text annotation can be tricky due to the way that R
handles fonts. The ggplot2 package doesn’t have all the
answers, but it does provide some tools to make your life a little
easier. The main tool for labelling plots is geom_text(),
which adds label text at the specified x and y positions.
geom_text() has the most aesthetics of any geom, because
there are so many ways to control the appearance of a text:
The family aesthetic provides the name of a font.
This aesthetic does allow you to use the name of a system font, but some
care is required. There are only three fonts that are guaranteed to work
everywhere: “sans” (the default), “serif”, or
“mono”. The reason that it can be tricky to use system
fonts in a plot is that text drawing is handled differently by each
graphics device (GD). There are two groups of GDs: screen devices such
as windows() (for Windows), quartz() (for Macs), x11() (mostly for
Linux) and RStudioGD() (within RStudio) draw the plot to the screen,
whereas file devices such as png() and pdf() write the plot to a file.
Unfortunately, the devices do not specify fonts in the same way so if
you want a font to work everywhere you need to configure the devices in
different ways. Two packages simplify the quandary a bit: showtext and extrafont.
The fontface aesthetic specifies the face, and can
take three values: “plain” (the default),
“bold” or “italic”.
You can adjust the alignment of the text with the
hjust (“left”, “center”, “right”, “inward”, “outward”) and
vjust (“bottom”, “middle”, “top”, “inward”, “outward”)
aesthetics. By default the alignment is centered, but there are often
good reasons to override this. One of the most useful alignments is
“inward”. It aligns text towards the middle of the plot, which ensures
that labels remain within the plot limits.
The font size is controlled by the size aesthetic. Unlike most tools, ggplot2 specifies the size in millimeters (mm), rather than the usual points (pts). The reason for this choice is that it makes the units for font sizes consistent with how other sizes are specified in ggplot2. (There are 72.27 pts in an inch, so to convert from points to mm, just multiply by 25.4 / 72.27.)
angle specifies the rotation of the text in
degrees.
The ggplot2 package does allow you to map data
values to the aesthetics used by geom_text(), but you
should use restraint: it is hard to perceive the relationship between
variables mapped to these aesthetics, and rarely useful to do so. In
addition to the various aesthetics, geom_text() has three
parameters that you can specify. Unlike the aesthetics these only take
single values, so they must be the same for all labels:
Often you want to label existing points on the plot, but you
don’t want the text to overlap with the points (or bars etc). In this
situation it’s useful to offset the text a little, which you can do with
the nudge_x and nudge_y parameters.
The third parameter is check_overlap. If
check_overlap = TRUE, overlapping labels will be
automatically removed from the plot. The algorithm is simple: labels are
plotted in the order they appear in the data frame; if a label would
overlap with an existing point, it’s omitted.
A variation on geom_text() is geom_label():
it draws a rounded rectangle behind the text. This makes it useful for
adding labels to plots with busy backgrounds. Labelling data well poses
some challenges:
Text does not affect the limits of the plot. Unfortunately
there’s no way to make this work since a label has an absolute size
(e.g. 3 cm), regardless of the size of the plot. This means that the
limits of a plot would need to be different depending on the size of the
plot — there’s just no way to make that happen with
ggplot2. Instead, you’ll need to tweak
xlim() and ylim() based on your data and plot
size.
If you want to label many points, it is difficult to avoid
overlaps. check_overlap = TRUE is useful, but offers little
control over which labels are removed. A popular technique for
addressing this is to use the ggrepel
package. The package supplies geom_text_repel(), which
optimizes the label positioning to avoid overlap. It works quite well so
long as the number of labels is not excessive.
It can sometimes be difficult to ensure that text labels fit within the space that you want. The ggfittext package contains useful tools that can assist with this, including functions that allow you to place text labels inside the columns in a bar chart.
Labelling individual points with text is an important kind of annotation, but it is not the only useful technique. The ggplot2 package provides several other tools to annotate plots using the same geoms you would use to display data. For example you can use:
geom_text() and geom_label() to add text,
as illustrated earlier.geom_rect() to highlight interesting rectangular
regions of the plot. geom_rect() has aesthetics
xmin, xmax, ymin and
ymax.geom_line(), geom_path() and
geom_segment() to add lines. All these geoms have an arrow
parameter, which allows you to place an arrowhead on the line. Create
arrowheads with arrow(), which has arguments
angle, length, ends and
type.geom_vline(), geom_hline() and
geom_abline() allow you to add reference lines (sometimes
called rules), that span the full range of the plot.Typically, you can either put annotations in the foreground (using
alpha if needed so you can still see the data), or in the background.
With the default background, a thick white line makes a useful
reference: it’s easy to see but it doesn’t jump out at you.
ggplot2 includes the annotate() helper
function, which creates the data frame for you, so that you can add
a single annotation to a plot. Another common form of annotation is to
highlight a subset of points by drawing larger points in a different
colour underneath the main data set.
“Direct labelling”, in which the plot region itself contains the
labels for groups of points instead of using a legend, usually makes the
plot easier to read because it puts the labels closer to the data. The
broader ggplot2 ecosystem contains a variety of other
tools to accomplish this in a more automated fashion. The
directlabels package provides a number of tools to make
this easier. Directlabels provides a number of position methods.
smart.grid is a reasonable place to start for scatterplots,
but there are other methods that are more useful for frequency polygons
and line plots.
# customizing a plot: set axis, legend, title and subtitle text with labs()
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(colour = factor(cyl))) +
  labs(
    x = "Engine displacement (litres)",
    y = "Highway miles per gallon",
    colour = "Number of cylinders",
    title = "Mileage by engine size and cylinders",
    subtitle = "Source: http://fueleconomy.gov"
  )

# adding other values to labs(): quote() lets you write mathematical
# expressions (see ?plotmath for the interpretation rules)
values <- seq(from = -2, to = 2, by = .01)
df <- data.frame(x = values, y = values^3)
ggplot(df, aes(x, y)) +
  geom_path() +
  labs(y = quote(f(x) == x^3))
# adding markdown elements to an axis title (requires the ggtext package)
df <- data.frame(x = 1:3, y = 1:3)
p1 <- ggplot(df, aes(x, y)) +
  geom_point() +
  labs(x = "Axis title with *italics* and **boldface**")
# fix: the original referenced an undefined object `base`; the intended
# plot is p1 with the markdown-rendering theme element applied
p2 <- p1 + theme(axis.title.x = ggtext::element_markdown())
(p1 | p2)

# customizing geom_text()
# family: map data values to fonts; only "sans", "serif" and "mono"
# are guaranteed to work on every graphics device
df <- data.frame(x = 1, y = 3:1, family = c("sans", "serif", "mono"))
ggplot(df, aes(x, y)) +
  geom_text(aes(label = family, family = family))

# fontface: "plain" (default), "bold" or "italic"
df <- data.frame(x = 1, y = 3:1, face = c("plain", "bold", "italic"))
ggplot(df, aes(x, y)) +
  geom_text(aes(label = face, fontface = face))

# alignment with hjust/vjust; default is centered
df <- data.frame(
  x = c(1, 1, 2, 2, 1.5),
  y = c(1, 2, 1, 2, 1.5),
  text = c(
    "bottom-left", "top-left",
    "bottom-right", "top-right", "center"
  )
)
ggplot(df, aes(x, y)) +
  geom_text(aes(label = text))

# "inward" pulls labels towards the panel centre, keeping them inside
# the plot limits
ggplot(df, aes(x, y)) +
  geom_text(aes(label = text), vjust = "inward", hjust = "inward")

# offsetting the text
# nudge_y shifts labels below their points; widen xlim so labels fit
df <- data.frame(trt = c("a", "b", "c"), resp = c(1.2, 3.4, 2.5))
ggplot(df, aes(resp, trt)) +
  geom_point() +
  geom_text(aes(label = paste0("(", resp, ")")), nudge_y = -0.25) +
  xlim(1, 3.6)

# avoiding overlapping: without check_overlap every label is drawn
ggplot(mpg, aes(displ, hwy)) +
  geom_text(aes(label = model)) +
  xlim(1, 8)

# check_overlap = TRUE drops any label that would overlap one plotted
# earlier (labels are placed in data-frame order)
ggplot(mpg, aes(displ, hwy)) +
  geom_text(aes(label = model), check_overlap = TRUE) +
  xlim(1, 8)

# drawing a rounded rectangle
# geom_label() draws a rounded rectangle behind the text, useful on
# busy backgrounds such as this density tile plot
label <- data.frame(
  waiting = c(55, 80),
  eruptions = c(2, 4.3),
  label = c("peak one", "peak two")
)
ggplot(faithfuld, aes(waiting, eruptions)) +
  geom_tile(aes(fill = density)) +
  geom_label(data = label, aes(label = label))

# labelling many points with the ggrepel package
# NOTE(review): sample() is not seeded, so the labelled subset differs
# on each knit — add set.seed() if reproducibility matters
mini_mpg <- mpg[sample(nrow(mpg), 20), ]
ggplot(mpg, aes(displ, hwy)) +
  geom_point(colour = "red") +
  ggrepel::geom_text_repel(data = mini_mpg, aes(label = class))

# custom annotations
# plotting the base unemployment series
ggplot(economics, aes(date, unemploy)) +
  geom_line()

# keep only presidencies that overlap the economics date range
presidential <- subset(presidential, start > economics$date[1])

# layer background annotations (rects, rules, names) under the data line
ggplot(economics) +
  geom_rect(
    aes(xmin = start, xmax = end, fill = party),
    ymin = -Inf, ymax = Inf, alpha = 0.2,
    data = presidential
  ) +
  geom_vline(
    aes(xintercept = as.numeric(start)),
    data = presidential,
    colour = "grey50", alpha = 0.5
  ) +
  geom_text(
    aes(x = start, y = 2500, label = name),
    data = presidential,
    size = 3, vjust = 0, hjust = 0, nudge_x = 50
  ) +
  geom_line(aes(date, unemploy)) +
  scale_fill_manual(values = c("blue", "red")) +
  xlab("date") +
  ylab("unemployment")

# using annotate()
# annotate() builds the one-row data frame for a single annotation for you;
# place a wrapped caption in the top-left corner of the data range
yrng <- range(economics$unemploy)
xrng <- range(economics$date)
caption <- paste(strwrap("Unemployment rates in the US have
varied a lot over the years", 40), collapse = "\n")
ggplot(economics, aes(date, unemploy)) +
  geom_line() +
  annotate(
    geom = "text", x = xrng[1], y = yrng[2],
    label = caption, hjust = 0, vjust = 1, size = 4
  )

# another example of annotate(): highlight the subaru points by drawing
# larger orange points underneath, then point at them with a curved arrow
ggplot(mpg, aes(displ, hwy)) +
  geom_point(
    data = filter(mpg, manufacturer == "subaru"),
    colour = "orange",
    size = 3
  ) +
  geom_point() +
  # annotate(geom = "point", x = 5.5, y = 40, colour = "orange", size = 3) +
  # annotate(geom = "point", x = 5.5, y = 40) +
  # annotate(geom = "text", x = 5.6, y = 40, label = "subaru", hjust = "left") +
  annotate(
    geom = "curve", x = 4, y = 35, xend = 2.65, yend = 27,
    curvature = .3, arrow = arrow(length = unit(2, "mm"))
  ) +
  annotate(geom = "text", x = 4.1, y = 35, label = "subaru", hjust = "left")

# direct labelling with directlabels
# compare a conventional legend (p1) with direct labels (p2)
p1 <- ggplot(mpg, aes(displ, hwy, colour = class)) +
  geom_point()
p2 <- ggplot(mpg, aes(displ, hwy, colour = class)) +
  geom_point(show.legend = FALSE) +
  directlabels::geom_dl(aes(label = class), method = "smart.grid")
(p1 | p2)

# using ggforce: mark and label groups with ellipses
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  ggforce::geom_mark_ellipse(aes(label = cyl, group = cyl))

# using gghighlight: emphasise a subset while de-emphasising the rest
data(Oxboys, package = "nlme")
ggplot(Oxboys, aes(age, height, group = Subject)) +
  geom_line() +
  geom_point() +
  gghighlight::gghighlight(Subject %in% 1:3)

# adding annotations across facets: geom_abline() repeats in every panel
mod_coef <- coef(lm(log10(price) ~ log10(carat), data = diamonds))
ggplot(diamonds, aes(log10(carat), log10(price))) +
  geom_bin2d() +
  geom_abline(
    intercept = mod_coef[1], slope = mod_coef[2],
    colour = "white", size = 1
  ) +
  facet_wrap(vars(cut), nrow = 1)

# another example of facets using gghighlight: the full data is shown
# greyed out in each panel behind the panel's own highlighted group
ggplot(mpg, aes(displ, hwy, colour = factor(cyl))) +
  geom_point() +
  gghighlight::gghighlight() +
  facet_wrap(vars(cyl))
# Linting
The code in this RMarkdown is linted with the lintr package, which is based on the tidyverse style guide.
# lintr::lint("main.Rmd", linters =
# lintr::with_defaults(
# commented_code_linter = NULL,
# trailing_whitespace_linter = NULL
# )
# )
# # if you have additional scripts and want them to be linted too, add them here
# lintr::lint("scripts/my_script.R")